In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
import warnings
warnings.filterwarnings('ignore')
In [2]:
import warnings

def function_that_warns():
    """Demo helper that always emits a DeprecationWarning."""
    warnings.warn("deprecated", DeprecationWarning)

# Demonstration of locally scoped warning suppression: inside this context
# manager the filter is overridden, so the call below stays silent, while
# warnings elsewhere are unaffected by this particular filter.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    function_that_warns()  # this will not show a warning
In [3]:
# NOTE(review): hardcoded user-specific path — this breaks on any other
# machine; consider a configurable DATA_DIR. Left as-is to preserve behavior.
path = '~/Desktop/MSCA 31015 DS for consulting/Assignments/Final assignment/'
df = pd.read_excel(path+'Final Assignment - Week 9.xlsx')

Examine the dataset

In [4]:
# Preview the first 10 rows to inspect the schema and sample values.
df.head(10)
Out[4]:
id logtarg r f m tof Ffiction1 Fclassics3 Fcartoons5 Flegends6 ... Mhealth35 Mcooking36 Mlearning37 MGamesRiddles38 Msports39 Mhobby40 Mnature41 Mencyclopaedia44 Mvideos50 Mnonbooks99
0 914 NaN 194 7 318.894775 1703 1 0 1 0 ... 14.899994 10.174706 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.0
1 957 NaN 3 14 368.052246 2364 1 0 2 0 ... 74.156250 0.000000 22.749985 0.000000 0.0 61.176025 17.150269 10.000000 0.000000 0.0
2 1406 NaN 1489 15 423.298340 2371 0 0 0 0 ... 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.0
3 1414 NaN 155 4 71.217041 1290 0 0 0 0 ... 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.0
4 1546 NaN 194 6 442.638184 2188 0 0 1 0 ... 41.518219 0.000000 7.949997 0.000000 0.0 17.895203 0.000000 0.000000 61.799988 0.0
5 1651 NaN 1797 2 47.933594 1808 0 0 0 0 ... 0.000000 20.400543 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.0
6 2046 NaN 383 3 52.399658 2186 0 0 0 0 ... 0.000000 0.000000 0.000000 9.949997 0.0 0.000000 15.313187 0.000000 0.000000 0.0
7 2534 NaN 80 7 119.282043 2063 0 0 0 0 ... 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.0
8 2801 NaN 96 9 164.324341 2453 1 1 0 0 ... 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.0
9 3689 NaN 705 5 137.147461 1839 0 0 0 0 ... 0.000000 6.949997 0.000000 0.000000 0.0 12.725838 10.123573 8.666389 0.000000 0.0

10 rows × 66 columns

In [5]:
# (rows, columns) of the full dataset
df.shape
Out[5]:
(33713, 66)

Descriptive Statistics

In [6]:
# Summary statistics for the three RFM columns (recency, frequency, monetary).
selected_col = ['r','f','m']
df[selected_col].describe()
Out[6]:
r f m
count 33713.000000 33713.000000 33713.000000
mean 508.427254 5.820603 251.924325
std 542.440039 7.008049 3929.929826
min 0.000000 0.000000 0.000000
25% 117.000000 1.000000 39.899994
50% 293.000000 3.000000 102.629944
75% 712.000000 7.000000 247.297485
max 2460.000000 118.000000 532892.000000

Histogram

In [7]:
# Distribution of the recency variable `r`.
df['r'].hist(bins=20, color='steelblue')
plt.title('Recency')
Out[7]:
Text(0.5, 1.0, 'Recency')
In [8]:
# Distribution of the frequency variable `f`.
df['f'].hist(bins=20, color='steelblue')
plt.title('Frequency')
Out[8]:
Text(0.5, 1.0, 'Frequency')
In [9]:
# Distribution of the monetary variable `m`.
df['m'].hist(bins=20, color='steelblue')
plt.title('Revenue')
Out[9]:
Text(0.5, 1.0, 'Revenue')

Creating a new dataset

In [10]:
# Keep only the columns needed for RFM segmentation.
rfm_columns = ['id', 'r', 'f', 'm', 'tof']
df2 = df[rfm_columns]
df2.head(10)
Out[10]:
id r f m tof
0 914 194 7 318.894775 1703
1 957 3 14 368.052246 2364
2 1406 1489 15 423.298340 2371
3 1414 155 4 71.217041 1290
4 1546 194 6 442.638184 2188
5 1651 1797 2 47.933594 1808
6 2046 383 3 52.399658 2186
7 2534 80 7 119.282043 2063
8 2801 96 9 164.324341 2453
9 3689 705 5 137.147461 1839

Customer Overview

In [11]:
# Collect every unique visitor id in the dataset.
all_customers = df2['id'].unique()
all_customers.size
Out[11]:
33713
In [12]:
# no repeated customers in the dataset
In [13]:
# Ids of customers with non-zero monetary value, i.e. those who actually bought.
buyer_ids = df2.loc[df2['m'] != 0, 'id']
customer_purchased = buyer_ids.sort_values().unique()
customer_purchased.size
Out[13]:
32822
In [14]:
# Customers who browsed but never purchased.
# Membership tests against a set are O(1); testing `x not in` a numpy array
# directly (as before) made this loop O(n*m) — quadratic over ~34k customers.
purchased_set = set(customer_purchased)
customer_browsed = [x for x in all_customers if x not in purchased_set]
len(customer_browsed)
Out[14]:
891
In [15]:
# There are 891 customers in the dataset who never made a purchase.

Recency

In [16]:
from sklearn.cluster import KMeans

# Elbow method: fit k-means for k = 1..9 on recency and record the inertia
# (SSE) so we can choose the k where the curve flattens.
sse = {}
tx_recency = df2[['r']]
for k in range(1, 10):
    # random_state pins the stochastic centroid initialisation so the elbow
    # curve is reproducible across notebook re-runs. The previous version
    # also wrote an unused "clusters" column onto the tx_recency slice,
    # which triggered SettingWithCopyWarning for no benefit — removed.
    kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=42).fit(tx_recency)
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE (inertia)")
plt.show()
In [17]:
# choose k = 4
In [19]:
#build 4 clusters for recency and add it to dataframe
# random_state makes the stochastic k-means initialisation reproducible
# across re-runs; fit_predict fits and labels in a single pass instead of
# the redundant fit-then-predict double pass.
kmeans = KMeans(n_clusters=4, random_state=42)
df2['RecencyCluster'] = kmeans.fit_predict(df2[['r']])

def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel k-means cluster ids so the labels are ordered by cluster mean.

    Labels produced by k-means are arbitrary; this maps them to 0..k-1
    ranked by the mean of `target_field_name` (ascending or descending),
    so the label itself becomes meaningful.

    Returns a new DataFrame with `cluster_field_name` replaced by the
    ordered labels; all other columns are unchanged.
    """
    cluster_means = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    cluster_means = cluster_means.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    # After sorting, the row position *is* the new, ordered label.
    cluster_means['index'] = cluster_means.index
    relabelled = pd.merge(df, cluster_means[[cluster_field_name, 'index']], on=cluster_field_name)
    relabelled = relabelled.drop([cluster_field_name], axis=1)
    relabelled = relabelled.rename(columns={"index": cluster_field_name})
    return relabelled

# ascending=False: the cluster with the largest mean recency (least recently
# active customers) becomes label 0; the most recent customers become label 3.
df2 = order_cluster('RecencyCluster', 'r',df2,False)
In [20]:
# Per-cluster summary of recency: counts and value ranges of each ordered cluster.
df2.groupby('RecencyCluster')['r'].describe()
Out[20]:
count mean std min 25% 50% 75% max
RecencyCluster
0 2571.0 1870.782186 261.007885 1503.0 1643.0 1810.0 2075.0 2460.0
1 4966.0 1134.143174 187.961932 828.0 971.0 1110.0 1293.0 1502.0
2 8385.0 520.432677 140.683016 327.0 388.0 514.0 626.0 825.0
3 17791.0 131.237367 93.210817 0.0 44.0 128.0 198.0 326.0

Cluster 0 contains the most inactive customers, while cluster 3 contains the most recently active ones.

Frequency

In [21]:
# apply k-means based on frequency
# random_state pins the stochastic centroid initialisation so the cluster
# assignment is reproducible; fit_predict fits and labels in one pass.
kmeans = KMeans(n_clusters=4, random_state=42)
df2['FrequencyCluster'] = kmeans.fit_predict(df2[['f']])

#order the frequency cluster (ascending: a higher label means higher frequency)
df2 = order_cluster('FrequencyCluster', 'f', df2, True)
In [22]:
# Per-cluster summary of frequency for each ordered cluster.
df2.groupby('FrequencyCluster')['f'].describe()
Out[22]:
count mean std min 25% 50% 75% max
FrequencyCluster
0 22172.0 2.305791 1.374109 0.0 1.0 2.0 3.0 5.0
1 8558.0 8.880112 2.460605 6.0 7.0 8.0 11.0 14.0
2 2618.0 20.208556 4.640130 15.0 16.0 19.0 23.0 32.0
3 365.0 44.394521 13.525636 33.0 35.0 39.0 48.0 118.0

A higher frequency cluster number indicates better (more frequently purchasing) customers.

Revenue

In [23]:
#apply clustering
# random_state pins the stochastic centroid initialisation so the cluster
# assignment is reproducible; fit_predict fits and labels in one pass.
kmeans = KMeans(n_clusters=4, random_state=42)
df2['RevenueCluster'] = kmeans.fit_predict(df2[['m']])


#order the cluster numbers (ascending: a higher label means higher revenue)
df2 = order_cluster('RevenueCluster', 'm', df2, True)
In [24]:
# Per-cluster summary of monetary value; the tiny top clusters (2, 1 and 1
# customers) are extreme high-revenue outliers.
df2.groupby('RevenueCluster')['m'].describe()
Out[24]:
count mean std min 25% 50% 75% max
RevenueCluster
0 33709.0 212.894339 371.457002 0.000 39.899994 102.607666 247.209473 18582.53125
1 2.0 200214.000000 74287.400995 147684.875 173949.437500 200214.000000 226478.562500 252743.12500
2 1.0 383349.500000 NaN 383349.500 383349.500000 383349.500000 383349.500000 383349.50000
3 1.0 532892.000000 NaN 532892.000 532892.000000 532892.000000 532892.000000 532892.00000

Overall Score

In [25]:
#calculate overall score and use mean() to see details
df2['OverallScore'] = df2['RecencyCluster'] + df2['FrequencyCluster'] + df2['RevenueCluster']
df2.groupby('OverallScore')['r','f','m'].mean()
Out[25]:
r f m
OverallScore
0 1883.857324 2.022794 82.697624
1 1173.271470 2.608443 101.674278
2 601.991870 3.174797 117.575043
3 216.650815 3.587436 127.320787
4 157.766928 9.915716 398.624784
5 106.114461 20.842675 769.282124
6 64.746224 44.676737 2168.072104
7 130.000000 14.500000 458120.750000

The scoring above clearly shows that customers with score 7 are our best customers, whereas those with score 0 are the worst.

In [26]:
# Map the 0-7 OverallScore onto three named tiers:
#   0-2 -> Low-Value, 3-4 -> Mid-Value, 5-7 -> High-Value
score = df2['OverallScore']
df2['Segment'] = np.select(
    [score > 4, score > 2],
    ['High-Value', 'Mid-Value'],
    default='Low-Value',
)
In [27]:
#Revenue vs Frequency
graph = df2.query("m < 50000 and f < 2000")

# One scatter trace per segment; marker size, colour and opacity distinguish
# the three value tiers.
segment_styles = [
    ('Low-Value', 'Low', 7, 'blue', 0.8),
    ('Mid-Value', 'Mid', 9, 'green', 0.5),
    ('High-Value', 'High', 11, 'red', 0.9),
]

plot_data = [
    go.Scatter(
        x=graph.loc[graph['Segment'] == segment, 'f'],
        y=graph.loc[graph['Segment'] == segment, 'm'],
        mode='markers',
        name=label,
        marker=dict(size=size, line=dict(width=1), color=color, opacity=opacity),
    )
    for segment, label, size, color, opacity in segment_styles
]

plot_layout = go.Layout(
    yaxis={'title': "Revenue"},
    xaxis={'title': "Frequency"},
    title='Segments'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
In [28]:
#Revenue vs Recency
graph = df2.query("m < 50000 and f < 2000")

# BUG FIX: the Low-Value trace previously plotted frequency ('f') on the
# x-axis while the axis label and the other two traces used recency ('r');
# all three traces now consistently plot recency on x.
segment_styles = [
    ('Low-Value', 'Low', 7, 'blue', 0.8),
    ('Mid-Value', 'Mid', 9, 'green', 0.5),
    ('High-Value', 'High', 11, 'red', 0.9),
]

plot_data = [
    go.Scatter(
        x=graph.loc[graph['Segment'] == segment, 'r'],
        y=graph.loc[graph['Segment'] == segment, 'm'],
        mode='markers',
        name=label,
        marker=dict(size=size, line=dict(width=1), color=color, opacity=opacity),
    )
    for segment, label, size, color, opacity in segment_styles
]

plot_layout = go.Layout(
    yaxis={'title': "Revenue"},
    xaxis={'title': "Recency"},
    title='Segments'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
In [29]:
#Frequency vs Recency
graph = df2.query("r < 50000 and f < 2000")

# One scatter trace per segment; marker size, colour and opacity distinguish
# the three value tiers.
segment_styles = [
    ('Low-Value', 'Low', 7, 'blue', 0.8),
    ('Mid-Value', 'Mid', 9, 'green', 0.5),
    ('High-Value', 'High', 11, 'red', 0.9),
]

plot_data = [
    go.Scatter(
        x=graph.loc[graph['Segment'] == segment, 'r'],
        y=graph.loc[graph['Segment'] == segment, 'f'],
        mode='markers',
        name=label,
        marker=dict(size=size, line=dict(width=1), color=color, opacity=opacity),
    )
    for segment, label, size, color, opacity in segment_styles
]

plot_layout = go.Layout(
    yaxis={'title': "Frequency"},
    xaxis={'title': "Recency"},
    title='Segments'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

From the graphs we can see how low-, mid- and high-value customers are distributed across the different dimensions. We conclude that we should improve retention of high-value customers, improve retention and increase purchase frequency for mid-value customers, and increase purchase frequency for low-value customers.

In [ ]: